# Data manipulation and plotting
library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly)
# library() errors on a missing package; require() only returns FALSE, so the
# former is used consistently here (gridExtra previously used require()).
library(RColorBrewer) ; library(viridis) ; library(gridExtra) ; library(colorspace)
library(GGally) ; library(ggpubr) ; library(ggExtra)
# Table labelling
library(expss)
# Generalised Additive Models (bias correction)
library(mgcv)
# Pairwise Wilcoxon tests
library(rstatix)
# ROC / lift curves
library(ROCR)
# Report tables
library(knitr) ; library(kableExtra)
# Return the hex colour(s) associated with SFARI gene score positions.
#
# r: integer vector of palette positions. Positions 1-3 are used below for
#    SFARI scores 1-3, and 7/8 for Neuronal/Other genes — assumed from the
#    scale_fill_manual() calls in this file; TODO confirm remaining slots.
# Returns a character vector of colours, one per element of r.
#
# Fix: the original body assigned the result to a local `pal`, so the value
# was returned invisibly; the subset is now returned directly.
SFARI_colour_hue = function(r) {
  c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}
# Load the preprocessed classification dataset (provides genes_info and
# dataset, judging by later usage) and the baseline Ridge regression results
# (provides ridge_predictions). Exact object lists assumed from usage below —
# TODO confirm against the .RData files.
load('../Data/preprocessedData/classification_dataset.RData')
load('../Data/Results/Ridge_regression.RData')
Work in fair classification can be categorised into three approaches:
After the model has been trained with the bias, perform a post-processing of the classifier outputs. This approach is quite simple to implement but has some downsides:
It has limited flexibility
Decoupling the training and calibration can lead to models with poor accuracy tradeoff (when training your model it may be focusing on the bias, in our case mean expression, and overlooking more important aspects of your data, such as biological significance)
Transforming the problem into a constrained optimisation problem (fairness as the constraint) using Lagrange multipliers.
Some of the downsides of this approach are:
The fairness constraints are often irregular and have to be relaxed in order to optimise
Training can be difficult, the Lagrangian may not even have a solution to converge to
Constrained optimisation can be inherently unstable
It can overfit and have poor fairness generalisation
It often yields poor trade-offs in fairness and accuracy
These approaches primarily involve “massaging” the data to remove bias.
Some downsides are:
Note: In earlier versions of this code, I implemented this approach by trying to remove the level of expression signal from each feature of the dataset (since the Module Membership features capture the bias in an indirect way), but removing the mean expression signal modified the module membership of the genes in big ways sometimes and it didn’t seem to solve the problem in the end, so this proved not to be very useful and wasn’t implemented in this final version
Since the effect of the bias is proportional to the mean level of expression of a gene, it can be corrected by fitting a curve to the relation between probability and mean expression of the genes in the dataset, and using the residuals of the fit to represent the unbiased output of the classifier.
# Join each gene's predicted probability with its mean level of expression
mean_expr_info = genes_info %>% dplyr::select(ID, meanExpr)
plot_data = ridge_predictions %>%
            dplyr::select(ID, prob) %>%
            left_join(mean_expr_info, by = 'ID')

# Linear model of probability on mean expression; its residuals are the
# candidate "bias-free" scores
lm_fit = lm(prob ~ meanExpr, data = plot_data)
plot_data$lm_res = lm_fit$residuals
# Left panel: linear fit of probability vs mean expression.
# Right panel: residuals of that fit, with an extra GAM smooth (dashed) to
# reveal any nonlinear trend the linear fit leaves behind.
fit_panel = ggplot(plot_data, aes(meanExpr, prob)) +
  geom_point(alpha = 0.1, color = '#0099cc') +
  geom_smooth(method = 'lm', color = 'gray', alpha = 0.1) +
  xlab('Mean Expression') + ylab('Probability') +
  theme_minimal()

res_panel = ggplot(plot_data, aes(meanExpr, lm_res)) +
  geom_point(alpha = 0.1, color = '#0099cc') +
  geom_smooth(method = 'gam', color = 'gray', alpha = 0.1, linetype = 'dashed') +
  geom_smooth(method = 'lm', color = 'gray', alpha = 0.1) +
  xlab('Mean Expression') + ylab('Residuals') +
  theme_minimal()

grid.arrange(fit_panel, res_panel, nrow = 1)
rm(fit_panel, res_panel)
The linear fit has an \(R^2\) of 0.003
A more flexible way of modelling the bias in the predictions is using Generalised Additive Models (GAMs), which use linear regression at its core, but incorporate smooth functions that allow it to model nonlinear relationships between the variables.
# Model probability as a smooth (penalised spline) function of mean expression;
# REML estimates the smoothing parameter.
gam_fit = gam(prob ~ s(meanExpr), method = 'REML', data = plot_data)
# GAM residuals = probabilities with the mean-expression trend removed
plot_data = plot_data %>% mutate(gam_res = gam_fit$residuals)
# Left panel: GAM fit of probability vs mean expression.
# Right panel: GAM residuals — the dashed smooth should now be flat if the
# mean-expression signal has been fully removed.
p1 = plot_data %>% ggplot(aes(meanExpr, prob)) + geom_point(alpha=0.1, color='#0099cc') +
geom_smooth(method='gam', color='gray', alpha=0.1) + xlab('Mean Expression') + ylab('Probability') +
theme_minimal()
p2 = plot_data %>% ggplot(aes(meanExpr, gam_res)) + geom_point(alpha=0.1, color='#0099cc') +
geom_smooth(method='gam', color='gray', alpha=0.1, linetype = 'dashed') +
xlab('Mean Expression') + ylab('Residuals') + theme_minimal()
grid.arrange(p1, p2, nrow=1)
# lm_fit is no longer needed after this comparison
rm(lm_fit, p1, p2)
The GAM fit has an \(R^2\) of 0.0206
GAMs are selected over a linear model because of their ability to accurately model and remove the bias in the dataset. The final step to use the residuals as unbiased probabilities is to re-scale them so all the genes have values between 0 and 1. Since the average value of the residuals is zero, this is accomplished by adding to each residual the average value of the probabilities of the Ridge regression model. This way, not only can the new scores be interpreted as probabilities, but the new unbiased model will have the same mean probability as the original one.
# Re-centre the GAM residuals on the original mean probability so they can be
# read as probabilities, then binarise at 0.5.
# NOTE(review): values are only clipped at 0, not at 1 — the text above says
# the rescaled scores lie in [0, 1]; presumably no re-centred residual exceeds
# 1, but worth confirming.
GAM_predictions = plot_data %>% mutate(prob = gam_res + mean(plot_data$prob)) %>% dplyr::select(ID, prob, meanExpr) %>%
mutate(prob = ifelse(prob<0, 0, prob), pred = prob > 0.5) %>%
left_join(ridge_predictions %>% dplyr::select(ID, hgnc_symbol, SFARI), by = 'ID')
# Plot results: corrected probability should show no trend against mean expression
GAM_predictions %>% ggplot(aes(meanExpr, prob)) + geom_point(alpha=0.2, color='#0099cc') +
geom_smooth(method='loess', color='gray', alpha=0.2, size = 0.5) + xlab('Mean Expression') +
ylab('GAM corrected probability') + #ggtitle('Mean expression vs model probability by gene') +
theme_minimal()
rm(gam_fit)
NOTE: The Ridge regression needs to be run 100 times again, which takes a long time. Because of this, this model is not run here in the notebook, but instead in the script train_ridge_regression_w_GAM_correction.R in the supportingScripts folder.
# Performance metrics of the Ridge model retrained with the GAM correction
# (means and SDs over the repeated runs, per data split)
load('../Data/Results/Ridge_regression_w_GAM_correction.RData')

pm = data.frame(mean_train = GAM_pm_train$mean, sd_train = GAM_pm_train$sd,
                mean_val   = GAM_pm_val$mean,   sd_val   = GAM_pm_val$sd,
                mean_test  = GAM_pm_test$mean,  sd_test  = GAM_pm_test$sd,
                row.names  = c('Accuracy','Precision','Recall','F1','AUC','MLP',
                               'Balanced Accuracy'))

kable(round(pm, 2),
      col.names = c('Train (mean)','Train (SD)','Validation (mean)','Validation (SD)',
                    'Test (mean)','Test (SD)'))
| Train (mean) | Train (SD) | Validation (mean) | Validation (SD) | Test (mean) | Test (SD) | |
|---|---|---|---|---|---|---|
| Accuracy | 0.64 | 0.01 | 0.77 | 0.01 | 0.77 | 0.01 |
| Precision | 0.62 | 0.01 | 0.09 | 0.01 | 0.09 | 0.01 |
| Recall | 0.44 | 0.02 | 0.42 | 0.04 | 0.42 | 0.04 |
| F1 | 0.52 | 0.02 | 0.15 | 0.01 | 0.14 | 0.01 |
| AUC | 0.68 | 0.01 | 0.65 | 0.03 | 0.65 | 0.03 |
| MLP | 2.30 | 0.08 | 14.00 | 5.96 | 14.87 | 5.96 |
| Balanced Accuracy | 0.62 | 0.01 | 0.60 | 0.02 | 0.60 | 0.02 |
rm(pm) # drop the performance-metrics table now that it has been rendered
# Distribution of GAM-corrected probabilities by gene category.
# Left: SFARI vs Neuronal vs Other genes; right: broken down by SFARI score.
# Pairwise Wilcoxon tests (BH-adjusted) are drawn as significance brackets.
plot_data = GAM_predictions %>% left_join(genes_info %>% dplyr::select(ID, gene.score), by = 'ID') %>%
mutate(SFARI_genes = ifelse(gene.score %in% c('Neuronal','Others'), as.character(gene.score), 'SFARI')) %>%
mutate(SFARI_genes = factor(SFARI_genes, levels = c('SFARI','Neuronal','Others'))) %>%
apply_labels(gene.score='SFARI Gene score')

# Pairwise tests between the three coarse categories
wt = plot_data %>% wilcox_test(prob~SFARI_genes, p.adjust.method = 'BH') %>% add_x_position(x = 'group')
# Vertical positions of the significance brackets
increase = 0.06
base = 0.85
pos_y_comparisons = c(base, base+increase, base)
p1 = plot_data %>% ggplot(aes(SFARI_genes, prob)) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=SFARI_genes)) +
stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .01) +
scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(8,7)))) +
xlab('') + ylab('Probability') +
scale_x_discrete(labels=c('SFARI' = 'SFARI\nGenes', 'Others' = 'Other\ngenes',
'Neuronal' = 'Neuronal\ngenes')) +
theme_minimal() + theme(legend.position = 'none')

# Pairwise tests between the individual SFARI scores and the control groups
wt = plot_data %>% wilcox_test(prob~gene.score, p.adjust.method = 'BH') %>% add_x_position(x = 'group')
increase = 0.05
base = 0.85
pos_y_comparisons = c(base + c(0,1,3,5)*increase, base+c(0,2,4)*increase, base+0:1*increase, base)
p2 = plot_data %>% ggplot(aes(gene.score, prob)) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=gene.score)) +
stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .01) +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + xlab('') + ylab('Probability') +
scale_x_discrete(labels=c('1' = 'SFARI\nScore 1', '2' = 'SFARI\nScore 2', '3' = 'SFARI\nScore 3',
'Others' = 'Other\ngenes', 'Neuronal' = 'Neuronal\ngenes')) +
theme_minimal() + theme(legend.position = 'none')

grid.arrange(p1, p2, nrow = 1)

# Fix: 'mean_vals' is never created in this script, so removing it raised a
# "object not found" warning; it has been dropped from the cleanup call.
rm(increase, base, pos_y_comparisons, wt, p1, p2)
The fact that the model continues to assign significantly different probabilities to the SFARI Scores even after removing the bias related to mean level of expression using the post-processing approach could mean that there is some other aspect characterising the genes with high Scores as SFARI Genes unrelated to their higher level of expression, but it could also mean that this bias correction method is only removing the bias superficially and that it still plays an indirect role in the characterisation of the genes.
To examine which of these two options is more likely, a second bias correction technique is implemented.
The original formula for the Demographic Parity bias is
\(c(x,0) = 0\) when the prediction is negative
\(c(x,1) = \frac{g(x)}{Z_G}-1\) when the prediction is positive. Where \(g(x)\) is the Kronecker delta to indicate if the sample belongs to the protected group and \(Z_G\) is the proportion of the population that belongs to the group we want to protect against bias
Using these definitions in our problem:
\(g(x):\) Since all our samples belong to the protected group, this would always be 1
\(Z_G:\) Since all of our samples belong to the protected group, this would also always be 1
So our measure of bias \(c(x,1) = \frac{1}{1}-1 = 0\) for all samples. This doesn’t work, so we need to adapt it to our continuous case
We can use \(c(x,1) = \frac{x-\text{mean}(\text{meanExpr}(D))}{\text{sd}(\text{meanExpr}(D))}\) as the constraint function, this way, when we calculate the bias of the dataset:
\(h(x)\cdot c(x)\) will only be zero if the positive samples are balanced around the mean expression, and the sign of the bias will indicate the direction of the bias
Pseudocode:
lambda = 0
w = [1, ..., 1]
c = (x-mean(meanExpr(D)))/sd(meanExpr(D))
h = train classifier H with lambda and w
for t in 1,...,T do
bias = <h(x), c(x)>
update lambda to lambda - eta*bias
update weights_hat to exp(lambda*mean(c))
update weights to w_hat/(1+w_hat) if y_i=1, 1/(1+w_hat) if y_i=0
update h with new weights
Return h
NOTE: This algorithm is even heavier than the Ridge regression, since it needs to run \(T\) iterations within each of the 100 iterations, which takes a long time. Because of this, this model is not run here in the notebook, but instead in the script train_ridge_regression_w_weights_correction.R in the supportingScripts folder.
# NOTE(review): the file name ends in '_.RData' (trailing underscore before
# the extension) — confirm this matches the output path of
# train_ridge_regression_w_weights_correction.R.
load('../Data/Results/Ridge_regression_w_weights_correction_.RData')
# Trace of the bias and of the balanced accuracy across the T reweighting
# iterations, reshaped to long format so both series share one plot
plot_data = data.frame('iter' = 1:length(weights_model_output$bias_vec),
'Bias' = weights_model_output$bias_vec,
'BalancedAccuracy' = weights_model_output$b_acc_vec) %>%
reshape2::melt(id.vars = 'iter') %>%
mutate(variable = ifelse(variable == 'BalancedAccuracy','Balanced accuracy', 'Bias'))
plot_data %>% ggplot(aes(x=iter, y=value, color = variable)) + geom_line() +
xlab('Iteration') + ylab('Value') + theme_minimal() + labs(color = 'Metric ') +
theme(legend.position = 'bottom')
Optimum lambda value: -1.5246167
The weighting technique assigns weights to the samples that counteract the bias related to mean level of expression
# Reconstruct the final sample weights from the converged lambda:
#   w_hat = exp(lambda * standardised mean expression)  (inverse to mean expr)
#   negatives keep  w = 1/(1+w_hat)                     (prop to mean expr)
#   positives get   w = w_hat/(1+w_hat)                 (inverse to mean expr)
lambda = weights_model_output$lambda
mean_expr = genes_info %>% dplyr::select(ID, meanExpr) %>% left_join(weights_predictions, by = 'ID') %>%
filter(n>0) %>% mutate('meanExpr_std' = (meanExpr-mean(meanExpr))/sd(meanExpr))

w_hat = exp(lambda*mean_expr$meanExpr_std) # inverse to mean expr
w = 1/(1+w_hat) # prop to mean expr
# Compute the SFARI mask once instead of three times in the subset-assignment
is_sfari = as.logical(mean_expr$SFARI)
# update w: inv mean expr Positives, prop Negatives:
w[is_sfari] = w[is_sfari]*w_hat[is_sfari]

plot_data = data.frame('meanExpr' = mean_expr$meanExpr, 'w_hat' = w_hat, 'w' = w, 'SFARI' = mean_expr$SFARI,
'pred' = mean_expr$pred)
plot_data %>% ggplot(aes(meanExpr, w, color = SFARI)) + geom_point(alpha = 0.2) +
xlab('Mean expression') + ylab('Weight') + theme_minimal() + theme(legend.position='bottom')
rm(lambda, mean_expr, w_hat, w, plot_data, is_sfari)
# Performance metrics of the Ridge model trained with the bias-correcting
# weights (means and SDs over the repeated runs, per data split)
pm = data.frame(mean_train = weights_pm_train$mean, sd_train = weights_pm_train$sd,
                mean_val   = weights_pm_val$mean,   sd_val   = weights_pm_val$sd,
                mean_test  = weights_pm_test$mean,  sd_test  = weights_pm_test$sd,
                row.names  = c('Accuracy','Precision','Recall','F1','AUC','MLP',
                               'Balanced Accuracy'))

kable(round(pm, 2),
      col.names = c('Train (mean)','Train (SD)','Validation (mean)','Validation (SD)',
                    'Test (mean)','Test (SD)'))
| Train (mean) | Train (SD) | Validation (mean) | Validation (SD) | Test (mean) | Test (SD) | |
|---|---|---|---|---|---|---|
| Accuracy | 0.63 | 0.01 | 0.81 | 0.02 | 0.81 | 0.02 |
| Precision | 0.65 | 0.01 | 0.09 | 0.02 | 0.09 | 0.02 |
| Recall | 0.29 | 0.04 | 0.34 | 0.05 | 0.33 | 0.05 |
| F1 | 0.40 | 0.04 | 0.15 | 0.03 | 0.14 | 0.03 |
| AUC | 0.66 | 0.01 | 0.63 | 0.03 | 0.63 | 0.03 |
| MLP | 2.26 | 0.12 | 10.23 | 6.36 | 11.55 | 6.36 |
| Balanced Accuracy | 0.59 | 0.01 | 0.59 | 0.03 | 0.58 | 0.03 |
rm(pm) # drop the performance-metrics table now that it has been rendered
# ROC and lift curves for the weighted model, scoring SFARI membership
rocr_pred = prediction(weights_predictions$prob, weights_predictions$SFARI)

rocr_roc = performance(rocr_pred, measure='tpr', x.measure='fpr')
rocr_auc = performance(rocr_pred, measure='auc')@y.values[[1]]
plot(rocr_roc, main=paste0('ROC curve (AUC=', round(rocr_auc, 2), ')'), col='#009999')
abline(a=0, b=1, col='#666666') # chance diagonal

rocr_lift = performance(rocr_pred, measure='lift', x.measure='rpp')
plot(rocr_lift, main='Lift curve', col='#86b300')

rm(rocr_pred, rocr_roc, rocr_auc, rocr_lift)
The magnitude of the GS has a positive coefficient, unlike in Gandal’s dataset
# Map each model coefficient back to the cluster (module) it represents, so
# the bars below can be coloured with the module colour.
cluster_table = genes_info %>% dplyr::select(Module, module_number) %>% unique()
# NOTE(review): 'MM.' is used as a regex, so '.' matches any character;
# presumably every module-membership feature is literally named 'MM#xxxxxx',
# in which case this is harmless — confirm, or add fixed = TRUE.
coefs_info = weights_coefs %>% mutate('Module' = ifelse(grepl('MM.', coef), paste0('#',gsub('MM.','',coef)),
coef %>% as.character)) %>%
left_join(cluster_table, by = 'Module') %>%
mutate('features' = ifelse(is.na(module_number), Module, paste0('CM Cl ', module_number)),
'color' = ifelse(grepl('#', Module), Module, 'gray')) %>% arrange(mean) %>%
mutate(features = ifelse(features == 'MTcor', 'CDcor', features))
# Mean coefficient per feature across the repeated runs, +/- 1 SD error bars
coefs_info %>% ggplot(aes(reorder(features, mean), y = mean)) + geom_bar(stat = 'identity', fill = coefs_info$color) +
geom_errorbar(aes(ymin=mean-sd, ymax=mean+sd), width=.3, position=position_dodge(.9)) +
xlab('Feature') + ylab('Coefficient') +
theme_minimal() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
legend.position = 'none')
To better understand the model, the coefficients of the features that represent the cluster-membership of each cluster can be contrasted to the characteristics of its corresponding cluster:
Negative coefficients correspond mainly to clusters with no enrichment in SFARI Genes and the positive coefficients to clusters with a high enrichment.
There is no visible relation between the cluster-diagnosis correlation and the magnitude of the features
# Relate each cluster's mean coefficient (x axis) to:
#   p1: the cluster-diagnosis correlation (MTcor)
#   p2: its enrichment in SFARI Genes (1 - ORA p-value)
# Point size = number of genes in the cluster, colour = module colour.
plot_data = coefs_info %>% inner_join(genes_info %>% dplyr::select(Module, MTcor, pval_ORA) %>% unique,
by='Module') %>%
left_join(data.frame(table(genes_info$Module)) %>% dplyr::rename('Module' = Var1))
p1 = ggplotly(plot_data %>% ggplot(aes(x=mean, y=MTcor)) +
geom_point(aes(size=Freq, ID = coef), color=plot_data$Module, alpha = 0.5) +
geom_smooth(alpha = 0.1, color = 'gray', size = 0.5) + coord_cartesian(ylim=c(-1,1)) +
#xlab('Coefficient') + ylab('Cluster-diagnosis correlation') +
theme_minimal() + theme(legend.position='none')) %>%
layout(xaxis = list(title='Coefficient'), yaxis = list(title='Cluster-diagnosis correlation'))
p2 = ggplotly(plot_data %>% ggplot(aes(x=mean, y=1-pval_ORA)) +
geom_point(aes(size=Freq, ID = coef), color=plot_data$Module, alpha = 0.5) +
geom_smooth(alpha = 0.1, color = 'gray', size = 0.5) + coord_cartesian(ylim=c(0,1)) +
#xlab('Coefficient') + ylab('Enrichment in SFARI Genes') +
theme_minimal() + theme(legend.position='none')) %>%
layout(xaxis = list(title='Coefficient'), yaxis = list(title='Enrichment in SFARI Genes'))
# Interactive side-by-side panels sharing axis titles
subplot(p1,p2, titleX = TRUE, titleY = TRUE)
rm(p1, p2)
As with the previous models, the Weighting technique assigns statistically higher probabilities to SFARI Genes than to the rest of the genes, independently of having neuronal annotations or not, and unlike in Gandal’s dataset, it continues to discriminate between SFARI Scores, assigning SFARI Score 1 higher probabilities than scores 2 and 3.
# Distribution of weighted-model probabilities by gene category.
# Left: SFARI vs Neuronal vs Other genes; right: broken down by SFARI score.
# Pairwise Wilcoxon tests (BH-adjusted) are drawn as significance brackets.
plot_data = weights_predictions %>% left_join(genes_info %>% dplyr::select(ID, gene.score), by = 'ID') %>%
mutate(SFARI_genes = ifelse(gene.score %in% c('Neuronal','Others'),as.character(gene.score),'SFARI')) %>%
mutate(SFARI_genes = factor(SFARI_genes, levels = c('SFARI','Neuronal','Others'))) %>%
apply_labels(gene.score='SFARI Gene score')

# Pairwise tests between the three coarse categories
wt = plot_data %>% wilcox_test(prob~SFARI_genes, p.adjust.method = 'BH') %>% add_x_position(x = 'group')
# Vertical positions of the significance brackets
increase = 0.05
base = 0.75
pos_y_comparisons = c(base, base+increase, base)
p1 = plot_data %>% ggplot(aes(SFARI_genes, prob)) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=SFARI_genes)) +
stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .01) +
scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(8,7)))) +
xlab('') + ylab('Probability') +
scale_x_discrete(labels=c('SFARI' = 'SFARI\nGenes', 'Others' = 'Other\ngenes',
'Neuronal' = 'Neuronal\ngenes')) +
theme_minimal() + theme(legend.position = 'none')

# Pairwise tests between the individual SFARI scores and the control groups
wt = plot_data %>% wilcox_test(prob~gene.score, p.adjust.method = 'BH') %>% add_x_position(x = 'group')
increase = 0.05
base = 0.75
pos_y_comparisons = c(base + c(0,1,3,5)*increase, base+c(0,2,4)*increase, base+0:1*increase, base)
p2 = plot_data %>% ggplot(aes(gene.score, prob)) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=gene.score)) +
stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .01) +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + xlab('') + ylab('Probability') +
scale_x_discrete(labels=c('1' = 'SFARI\nScore 1', '2' = 'SFARI\nScore 2', '3' = 'SFARI\nScore 3',
'Others' = 'Other\ngenes', 'Neuronal' = 'Neuronal\ngenes')) +
theme_minimal() + theme(legend.position = 'none')

grid.arrange(p1, p2, nrow = 1)

# Fix: 'mean_vals' is never created in this script, so removing it raised a
# "object not found" warning; it has been dropped from the cleanup call.
rm(increase, base, pos_y_comparisons, wt)
To identify new genes that could play a role in ASD, the 10 non-SFARI genes that were assigned the highest probabilities by the Weighting technique are presented.
# Empirical CDFs to express each gene's GS and cluster-diagnosis correlation
# as percentiles within the full dataset
perc_GS = ecdf(dataset$GS)
perc_MTcor = ecdf(unique(dataset$MTcor))
# Highest-probability non-SFARI genes from the weighted model.
# NOTE(review): the text above says "10" genes, but top_n(n=25) keeps 25 (and
# the rendered table shows 25 rows) — confirm which is intended.
top_genes = weights_predictions %>% arrange(desc(prob)) %>% filter(!SFARI) %>% top_n(n=25, wt=prob) %>%
dplyr::select(ID, hgnc_symbol, prob) %>%
left_join(dataset %>% mutate(ID=rownames(.)) %>% dplyr::select(ID, GS, MTcor), by = 'ID') %>%
mutate(perc_GS = round(100*perc_GS(GS)), perc_MTcor = round(100*perc_MTcor(MTcor))) %>%
dplyr::select(hgnc_symbol, GS, perc_GS, MTcor, perc_MTcor, prob) %>%
mutate(prob = round(prob,3), GS = round(GS,2), MTcor = round(MTcor,2))
kable(top_genes)
| hgnc_symbol | GS | perc_GS | MTcor | perc_MTcor | prob |
|---|---|---|---|---|---|
| NA | 0.02 | 54 | 0.02 | 53 | 0.742 |
| NA | 0.16 | 71 | -0.29 | 27 | 0.733 |
| NA | 0.11 | 66 | -0.29 | 27 | 0.731 |
| NA | 0.00 | 51 | 0.02 | 53 | 0.714 |
| NA | 0.12 | 66 | -0.29 | 27 | 0.711 |
| NA | -0.06 | 44 | -0.29 | 27 | 0.705 |
| NA | -0.10 | 39 | 0.05 | 57 | 0.705 |
| NA | 0.13 | 67 | -0.29 | 27 | 0.703 |
| NA | 0.09 | 62 | -0.29 | 27 | 0.700 |
| NA | -0.09 | 40 | -0.29 | 27 | 0.698 |
| NA | -0.28 | 19 | -0.48 | 10 | 0.693 |
| NA | -0.11 | 38 | -0.29 | 27 | 0.693 |
| NA | -0.10 | 39 | -0.29 | 27 | 0.687 |
| NA | -0.23 | 23 | -0.29 | 27 | 0.687 |
| NA | -0.42 | 8 | -0.29 | 27 | 0.685 |
| NA | -0.07 | 43 | -0.29 | 27 | 0.684 |
| NA | 0.08 | 61 | -0.29 | 27 | 0.684 |
| NA | 0.02 | 55 | -0.29 | 27 | 0.683 |
| NA | 0.06 | 59 | -0.18 | 39 | 0.682 |
| NA | 0.02 | 54 | -0.29 | 27 | 0.675 |
| NA | -0.17 | 30 | -0.29 | 27 | 0.675 |
| NA | -0.12 | 36 | -0.29 | 27 | 0.671 |
| NA | 0.16 | 71 | -0.29 | 27 | 0.670 |
| NA | -0.01 | 51 | -0.29 | 27 | 0.670 |
| NA | 0.36 | 89 | -0.29 | 27 | 0.666 |
The GS percentiles of the top 10 genes have a mean of 56, and a standard deviation of 12.0185043. The Cluster-diagnosis correlation has a mean of 35.2 and a standard deviation of 13.2480607
These results show that it is possible to use gene expression dysregulation information to characterise SFARI Genes in an unbiased way, discovering that SFARI Genes have low dysregulation themselves but are associated to groups of genes with high dysregulation, and also proving that this information is useful to find new genes that have a high probability of being associated to ASD.
sessionInfo() # record package versions for reproducibility
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] kableExtra_1.1.0 knitr_1.32 ROCR_1.0-7 gplots_3.0.3
## [5] rstatix_0.6.0 mgcv_1.8-36 nlme_3.1-153 expss_0.10.7
## [9] ggExtra_0.9 ggpubr_0.2.5 magrittr_2.0.1 GGally_1.5.0
## [13] colorspace_2.0-2 gridExtra_2.3 viridis_0.6.1 viridisLite_0.4.0
## [17] RColorBrewer_1.1-2 plotly_4.9.2 glue_1.4.2 reshape2_1.4.4
## [21] forcats_0.5.0 stringr_1.4.0 dplyr_1.0.1 purrr_0.3.4
## [25] readr_1.3.1 tidyr_1.1.0 tibble_3.1.2 ggplot2_3.3.5
## [29] tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] ggsignif_0.6.2 ellipsis_0.3.2 rio_0.5.16 htmlTable_1.13.3
## [5] fs_1.5.0 rstudioapi_0.13 farver_2.1.0 fansi_0.5.0
## [9] lubridate_1.7.10 xml2_1.3.2 splines_3.6.3 jsonlite_1.7.2
## [13] broom_0.7.0 dbplyr_1.4.2 shiny_1.6.0 compiler_3.6.3
## [17] httr_1.4.2 backports_1.2.1 assertthat_0.2.1 Matrix_1.2-18
## [21] fastmap_1.1.0 lazyeval_0.2.2 cli_3.0.1 later_1.3.0
## [25] htmltools_0.5.2 tools_3.6.3 gtable_0.3.0 Rcpp_1.0.7
## [29] carData_3.0-4 cellranger_1.1.0 jquerylib_0.1.4 vctrs_0.3.8
## [33] gdata_2.18.0 crosstalk_1.1.1 xfun_0.25 openxlsx_4.2.4
## [37] rvest_0.3.5 mime_0.11 miniUI_0.1.1.1 lifecycle_1.0.0
## [41] gtools_3.9.2 scales_1.1.1 hms_1.1.0 promises_1.2.0.1
## [45] yaml_2.2.1 curl_4.3.2 sass_0.4.0 reshape_0.8.8
## [49] stringi_1.7.4 highr_0.9 checkmate_2.0.0 caTools_1.18.2
## [53] zip_2.2.0 rlang_0.4.11 pkgconfig_2.0.3 matrixStats_0.60.1
## [57] bitops_1.0-7 evaluate_0.14 lattice_0.20-41 labeling_0.4.2
## [61] htmlwidgets_1.5.3 tidyselect_1.1.1 plyr_1.8.6 R6_2.5.1
## [65] generics_0.1.0 DBI_1.1.1 pillar_1.6.2 haven_2.2.0
## [69] foreign_0.8-76 withr_2.4.2 abind_1.4-5 modelr_0.1.6
## [73] crayon_1.4.1 car_3.0-7 KernSmooth_2.23-17 utf8_1.2.2
## [77] rmarkdown_2.7 grid_3.6.3 readxl_1.3.1 data.table_1.14.0
## [81] webshot_0.5.2 reprex_0.3.0 digest_0.6.27 xtable_1.8-4
## [85] httpuv_1.6.1 munsell_0.5.0 bslib_0.3.0